# =============================================================================
# Title:
# - False Discovery Rate (FDR) Control for Spatial Raster Data. 
#
# Authors:
# - Oliver Gutiérrez-Hernández, Department of Geography, University of Málaga (UMA) (Corresponding author: olivergh@uma.es).
# - Luis V. García, Institute of Natural Resources and Agrobiology of Seville (IRNAS), Spanish National Research Council (CSIC).
# 
#
# Objective:
# - Adjusting p-values using the Adaptive FDR control procedure,
#   through two alternative methods: the classical FDR-BH procedure (Benjamini & Hochberg, 1995)
#   and the adaptive FDR-BKY procedure (Benjamini, Krieger & Yekutieli, 2006).
#
# References:
# - Benjamini, Y., & Hochberg, Y. (1995). "Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing."
#   Journal of the Royal Statistical Society: Series B (Methodological), 57, 289–300. https://doi.org/10.1111/j.2517-6161.1995.tb02031.x
#   → This foundational article introduces the concept of False Discovery Rate (FDR) and formally defines the widely used FDR-BH procedure.
#     The method assumes independent or positively dependent tests and initially considers the global null hypothesis (i.e., that all null hypotheses are true).
# - Benjamini, Y., Krieger, A. M., & Yekutieli, D. (2006). "Adaptive Linear Step-Up Procedures that Control the False Discovery Rate."
#   Biometrika, 93(3), 491–507. https://doi.org/10.1093/biomet/93.3.491
#   → This article extends the classical FDR-BH approach by introducing adaptive step-up procedures.
#     It estimates the proportion of true null hypotheses (π₀) to improve statistical power,
#     while maintaining FDR control under independence or positive dependence.
#     In practice, it is generally more powerful than the standard BH method, while being equally safe for FDR control.
#
# Practical Applications:
# - Suitable for rigorous multiple hypothesis testing on spatial raster data.
# - Particularly effective under positive dependency structures (see Benjamini & Yekutieli, 2001).
# - Applications include:
#     • Environmental sciences: spatial/temporal trends in climate variables.
#     • Remote sensing: trends in NDVI, land cover, or urban expansion.
#     • Public health: mapping disease, exposure, or spatial health risks.
#     • Agriculture: identifying significant trends in crop health or productivity.
#
# Input:
# - Raster file (GeoTIFF): a gridded p-value image (.tif), with values in [0, 1] and NA for invalid/missing data.
#
# Outputs:
# 1. Summary statistics:
#    - Valid pixels and percentages of significant results (raw and adjusted).
# 2. Graphical outputs:
#    - Threshold plots (FDR-BH and Adaptive FDR-BKY).
#    - Raster plots and bar plots of discoveries.
# 3. Raster files:
#    - Adjusted p-values and binary significance maps (stored in the same directory as the input file).
# =============================================================================

# --- Package installation (if needed) ---
# Define a utility function to install required packages only if they are not yet available.
# This improves portability and reproducibility across systems, and avoids unnecessary reinstalls.
# The script deliberately uses a minimal number of dependencies to ensure simplicity and maintainability.
# Although the Adaptive FDR-BKY procedure could be implemented manually as a standalone function,
# we chose to rely on a reference implementation provided by the 'cp4p' package (via 'multtest')
# to acknowledge the original authors and rely on a validated, peer-recognised solution widely cited in the literature.

install_if_missing <- function(package, bioc = FALSE) {
    if (!requireNamespace(package, quietly = TRUE)) {
        if (bioc) {
            if (!requireNamespace("BiocManager", quietly = TRUE)) {
                install.packages("BiocManager", dependencies = TRUE)
            }
            BiocManager::install(package, ask = FALSE, update = FALSE)
        } else {
            install.packages(package, dependencies = TRUE)
        }
    }
}

# --- Install required packages ---

# CRAN packages
# ⚠️ Install packages available from CRAN that are essential for raster handling and FDR procedures.
# 'terra' is used for geospatial raster operations, and 'cp4p' provides core FDR functionality.
install_if_missing("terra")       # For handling raster data
install_if_missing("cp4p")        # For applying FDR procedures, including Adaptive FDR-BKY via 'multtest'

# Bioconductor packages
# These are required dependencies for the 'cp4p' package.
# Specifically, 'multtest' provides the implementation of the adaptive step-up procedure.
# 'qvalue' and 'limma' are not used directly in this script but are required dependencies for 'cp4p'.
install_if_missing("multtest", bioc = TRUE)
install_if_missing("qvalue", bioc = TRUE)
install_if_missing("limma", bioc = TRUE)

# --- Load main packages used in the workflow ---
# Load the core libraries needed to run the workflow.
# 'terra' is used for spatial raster manipulation, while 'cp4p' provides the adaptive FDR-BKY method.
# The FDR-BH procedure is applied using the 'stats' package from base R (function: p.adjust, argument: method = "BH")
library(terra)   # For raster data handling
library(cp4p)    # For FDR estimation and control, including Adaptive FDR-BKY (via 'multtest')


# --- Load the p-values raster ---
# ⚠️ Replace with your actual file path and ensure the filename is correct and informative
# This is the primary input for the entire workflow; an incorrect path or file may lead to silent errors or misaligned outputs.
raster_path <- "C:/data/p-values.tif"
test_raster <- rast(raster_path)
raster_base_name <- tools::file_path_sans_ext(basename(raster_path))
output_dir <- dirname(raster_path)

# --- Optional: Duplicate original raster as backup ---
# This ensures the original data is preserved in case further operations
# (e.g., overwriting, thresholding or adjustments) need to be reverted or inspected later.
original_copy_path <- file.path(output_dir, paste0(raster_base_name, "_original.tif"))
writeRaster(test_raster, original_copy_path, overwrite = TRUE)

# --- Extract and validate values ---
# Extract pixel values from the raster and identify valid (non-NA) entries.
# This step ensures subsequent statistical computations are only applied to meaningful data,
# avoiding distortions due to missing or invalid values.
p_values <- values(test_raster)
valid_indices <- !is.na(p_values)
num_valid_pixels <- sum(valid_indices)
num_invalid_pixels <- sum(is.na(p_values))

# --- p-value adjustment ---
# Apply multiple testing correction to the p-values using two methods:
# - Benjamini-Hochberg (BH) to control the FDR assuming independence or positive dependence.
# - Adaptive FDR-BKY to improve power under adaptive estimation of the proportion of true nulls (π₀).
# The adjustment is applied only to valid (non-NA) pixels to avoid artefacts.
adjusted_p_values_bky <- rep(NA, length(p_values))
adjusted_p_values_bh <- rep(NA, length(p_values))
adjusted_p_values_bky[valid_indices] <- adjust.p(p_values[valid_indices], pi0.method = "bky", alpha = 0.05)$adjp[, 2]
adjusted_p_values_bh[valid_indices] <- p.adjust(p_values[valid_indices], method = "BH")

# --- Binary maps ---
# Create binary significance maps by thresholding p-values (raw and adjusted).
# This step helps visualise which pixels are considered statistically significant
# under each method. NA values are preserved to maintain spatial structure.
binary_original <- ifelse(p_values <= 0.05, 1, 0)
binary_bky <- ifelse(adjusted_p_values_bky <= 0.05, 1, 0)
binary_bh <- ifelse(adjusted_p_values_bh <= 0.05, 1, 0)
binary_original[!valid_indices] <- NA
binary_bky[!valid_indices] <- NA
binary_bh[!valid_indices] <- NA

# --- Create raster layers ---
# Reconstruct raster objects from the computed vectors (adjusted p-values and binary maps).
# This step is necessary to visualise and export the results while preserving geospatial attributes
# such as extent, resolution and coordinate reference system.
raster_adjusted_bky <- setValues(test_raster, adjusted_p_values_bky)
raster_adjusted_bh <- setValues(test_raster, adjusted_p_values_bh)
raster_binary_original <- setValues(test_raster, binary_original)
raster_binary_bky <- setValues(test_raster, binary_bky)
raster_binary_bh <- setValues(test_raster, binary_bh)

# --- Summary statistics ---
# Compute the number and percentage of significant discoveries under each method.
# This summary provides a quick overview of the extent of statistical significance
# in the spatial domain, allowing for comparison across procedures.
num_rejected_original <- sum(binary_original == 1, na.rm = TRUE)
num_rejected_bky <- sum(binary_bky == 1, na.rm = TRUE)
num_rejected_bh <- sum(binary_bh == 1, na.rm = TRUE)
perc_original <- (num_rejected_original / num_valid_pixels) * 100
perc_bky <- (num_rejected_bky / num_valid_pixels) * 100
perc_bh <- (num_rejected_bh / num_valid_pixels) * 100

cat("\n--- Summary Statistics ---\n")
cat("Valid pixels:", num_valid_pixels, "\n")
cat("Significant pixels (raw p-values < 0.05):", num_rejected_original, "(", round(perc_original, 2), "%)\n")
cat("Significant pixels (FDR-BH < 0.05):", num_rejected_bh, "(", round(perc_bh, 2), "%)\n")
cat("Significant pixels (Adaptive FDR-BKY < 0.05):", num_rejected_bky, "(", round(perc_bky, 2), "%)\n")

# --- FDR-BH and BKY plots ---
# Visualise the p-value distribution against their respective significance thresholds
# for both FDR-BH and Adaptive FDR-BKY. These plots help assess the number of discoveries
# and provide insight into the behaviour of each correction method relative to ranked p-values.
ordered_p_values <- sort(p_values[valid_indices])
m <- length(ordered_p_values)
q <- 0.05
thresh_bh <- (1:m) * q / m
res_bky <- adjust.p(ordered_p_values, pi0.method = "bky", alpha = q)
pi0_est <- res_bky$pi0
thresh_bky <- (1:m) * q / (m * pi0_est)

par(mfrow = c(1, 2), mar = c(5, 5, 4, 2) + 0.1, cex.lab = 1.2, cex.axis = 1.1, cex.main = 1.3)

plot(1:m, ordered_p_values, pch = 19, col = ifelse(ordered_p_values <= thresh_bh, 'blue', 'red'),
     xlab = expression("Rank"), ylab = expression(italic(p)*"-values"),
     main = expression("FDR-BH threshold (" * italic(q) * " = 0.05)"), ylim = c(0, 1))
lines(1:m, thresh_bh, col = 'skyblue', lwd = 2)
grid()

plot(1:m, ordered_p_values, pch = 19, col = ifelse(ordered_p_values <= thresh_bky, 'blue', 'red'),
     xlab = expression("Rank"), ylab = expression(italic(p)*"-values"),
     main = expression("Adaptive FDR-BKY threshold (" * italic(q) * " = 0.05)"),
     ylim = c(0, max(ordered_p_values, na.rm = TRUE)))
lines(1:m, thresh_bky, col = 'skyblue', lwd = 2)
grid()

# --- Bar plot function with labels ---
# Generate horizontal bar plots to summarise the number and proportion of discoveries.
# These plots complement the raster maps by providing an aggregated, quantitative view
# of significance detection under each method.
draw_bar <- function(sig, total, main_title) {
    bar_colors <- c("gray95", "blue")
    bar_labels <- rev(c("Discoveries", "Not discoveries"))
    bar_vals <- rev(c(sig, total - sig))
    bar_perc <- round(bar_vals / total * 100, 1)
    bar_text <- paste0(bar_vals, " (", bar_perc, "%)")
    bp <- barplot(bar_vals, names.arg = bar_labels, col = bar_colors,
                  main = main_title, xlab = "No. of tests", horiz = TRUE, las = 1, xlim = c(0, total),
                  xaxs = "i")
    text(x = bar_vals + total * 0.02, y = bp, labels = bar_text, pos = 4, cex = 1.2, col = "black")
}

# --- Bar charts ---
# Combine spatial maps with summary bar plots for each significance criterion.
# This dual visualisation helps interpret spatial patterns while providing
# a clear, aggregated comparison of the number of discoveries across methods.
par(mfrow = c(3, 2), mar = c(5, 5, 4, 2) + 0.1, cex.lab = 1.2, cex.axis = 1.1, cex.main = 1.3)
plot(raster_binary_original, main = expression("Significant (raw " * italic(p) * "-values, α = 0.05)"), col = c("gray95", "blue"), legend = FALSE)

draw_bar_raw <- function(sig, total, main_title) {
    bar_colors <- c("gray95", "blue")
    bar_labels <- rev(c("Significant", "Not significant"))
    bar_vals <- rev(c(sig, total - sig))
    bar_perc <- round(bar_vals / total * 100, 1)
    bar_text <- paste0(bar_vals, " (", bar_perc, "%)")
    bp <- barplot(bar_vals, names.arg = bar_labels, col = bar_colors,
                  main = main_title, xlab = "No. of tests", horiz = TRUE, las = 1, xlim = c(0, total),
                  xaxs = "i")
    text(x = bar_vals + total * 0.02, y = bp, labels = bar_text, pos = 4, cex = 1.2, col = "black")
}
draw_bar_raw(num_rejected_original, num_valid_pixels, expression("Significant (raw " * italic(p) * "-values, α = 0.05)"))
plot(raster_binary_bh, main = expression("Discoveries (FDR-BH, " * italic(q) * " = 0.05)"), col = c("gray95", "blue"), legend = FALSE)
draw_bar(num_rejected_bh, num_valid_pixels, expression("Discoveries (FDR-BH, " * italic(q) * " = 0.05)"))
plot(raster_binary_bky, main = expression("Discoveries (Adaptive FDR-BKY, " * italic(q) * " = 0.05)"), col = c("gray95", "blue"), legend = FALSE)
draw_bar(num_rejected_bky, num_valid_pixels, expression("Discoveries (Adaptive FDR-BKY, " * italic(q) * " = 0.05)"))

# --- File output (saved in input directory) ---
# Export all generated raster layers to disk for future analysis or integration.
# Files are named based on the input raster's base name to ensure traceability and consistency
# across adjusted p-values and binary significance maps.
writeRaster(raster_adjusted_bh, file.path(output_dir, paste0(raster_base_name, "_adjusted_FDR-BH.tif")), overwrite = TRUE)
writeRaster(raster_adjusted_bky, file.path(output_dir, paste0(raster_base_name, "_adjusted_FDR-adaptive.tif")), overwrite = TRUE)
writeRaster(raster_binary_original, file.path(output_dir, paste0(raster_base_name, "_binary_original.tif")), overwrite = TRUE)
writeRaster(raster_binary_bh, file.path(output_dir, paste0(raster_base_name, "_binary_FDR-BH.tif")), overwrite = TRUE)
writeRaster(raster_binary_bky, file.path(output_dir, paste0(raster_base_name, "_binary_FDR-adaptive.tif")), overwrite = TRUE)

cat("\n--- File Outputs ---\n")
cat("Original raster copy:", original_copy_path, "\n")
cat("Adjusted p-values raster (FDR-BH):", file.path(output_dir, paste0(raster_base_name, "_adjusted_FDR-BH.tif")), "\n")
cat("Adjusted p-values raster (Adaptive FDR):", file.path(output_dir, paste0(raster_base_name, "_adjusted_FDR-adaptive.tif")), "\n")
cat("Binary raster (raw p-values):", file.path(output_dir, paste0(raster_base_name, "_binary_original.tif")), "\n")
cat("Binary raster (FDR-BH):", file.path(output_dir, paste0(raster_base_name, "_binary_FDR-BH.tif")), "\n")
cat("Binary raster (Adaptive FDR):", file.path(output_dir, paste0(raster_base_name, "_binary_FDR-adaptive.tif")), "\n")
